home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 242 / Issue 242 - April 2008 - DPCS0408DVD.ISO / Software Money Savers / VirtualDub / Source / VirtualDub-1.7.7-src.7z / src / Priss / source / layer3.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2007-10-13  |  45.3 KB  |  1,581 lines

  1. //    Priss (NekoAmp 2.0) - MPEG-1/2 audio decoding library
  2. //    Copyright (C) 2003 Avery Lee
  3. //
  4. //    This program is free software; you can redistribute it and/or modify
  5. //    it under the terms of the GNU General Public License as published by
  6. //    the Free Software Foundation; either version 2 of the License, or
  7. //    (at your option) any later version.
  8. //
  9. //    This program is distributed in the hope that it will be useful,
  10. //    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12. //    GNU General Public License for more details.
  13. //
  14. //    You should have received a copy of the GNU General Public License
  15. //    along with this program; if not, write to the Free Software
  16. //    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17. //
  18. ///////////////////////////////////////////////////////////////////////////
  19. //
  20. // MPEG-1/2 layer III decoding
  21. //
  22. // Layer III is fugly compared to layer I/II.  The general stages of decoding
  23. // are:
  24. //
  25. // 1) Sliding window (primitive VBR)
  26. //
  27. //    Data regions of each frame are interpreted as a continuous stream and
  28. //    the encoded values for a frame start 0-511 bytes behind the data region
  29. //    for that frame.
  30. //
  31. // 2) Huffman decoding
  32. //
  33. //    Layer III granules encode 576 frequency lines, composed of 32 subbands
  34. //    of 18 samples each.  The 576 values are broken into four regions: two
  35. //    encoded in pairs using two huffman tables, a third composed of only
  36. //    -1/0/+1 in quadruples, and the highest part of the spectrum all zeroes.
  37. //    When this stage fails, the whole frame is trashed.
  38. //
  39. // 3) Dequantization
  40. //
  41. //    Decoded values are pushed through a nonlinear power ramp and then
  42. //    modulated by scalefactors.  Scalefactors group frequency lines -- not
  43. //    necessarily dividing cleanly between subbands -- differently depending
  44. //    on the block type:
  45. //
  46. //    A) long blocks (usual case) - 21 scalefactor bands
  47. //    B) short blocks (transients) - 12 scalefactor bands for three time
  48. //                                   windows (6 samples each)
  49. //    C) mixed blocks - 2 long bands and 3 windows of 9 bands each.
  50. //
  51. //    Short blocks seldom occur and are not really worth optimizing (one
  52. //    problem is that they only encode one-third of the frequency range).
  53. //    I've never seen mixed blocks.
  54. //
  55. // 4) Stereo processing
  56. //
  57. //    Layer III supports two types of joint stereo, mid/side (MS) and
  58. //    intensity stereo (IS).  In MS, a simple butterfly is used to convert
  59. //    sum and difference signals into stereo; in IS, a mono channel is
  60. //    cast to stereo with different amplitudes on the left and right.
  61. //    When both MS and IS are active, the switch from MS to IS occurs when
  62. //    the data for the right channel ends.
  63. //
  64. //    Priss currently has a small bug in MPEG-2 IS processing: unlike
  65. //    MPEG-1, short blocks in MPEG-2 have the MS-to-IS switch point tracked
  66. //    per window.  MPEG-2 audio with intensity stereo is a bit hard to
  67. //    come by (the LAME build I have doesn't do it), and even harder to get
  68. //    into an MPEG-1 video file, so I don't have a test case for this.
  69. //
  70. // 5) Antialiasing
  71. //
  72. //    I don't really understand this, but rotations are performed between
  73. //    frequency lines in adjacent subbands.  Without it, a sort of "halo"
  74. //    echo effect is heard.  Next.
  75. //
  76. // 6) IMDCT
  77. //
  78. //    The IMDCT, like other DCT transforms, has infinite basis functions and
  79. //    thus performs poorly with transients.  Short blocks combat this by
  80. //    slicing the IMDCT into thirds, thus reducing the length of preecho.
  81. //    Slower transients can also be modeled with the regular 18-point IMDCT
  82. //    by using the attack and release windows (types 1 and 3) instead of
  83. //    the regular sinusoidal window (type 0).
  84. //
  85. // 7) Polyphase filter
  86. //
  87. //    As with layer I and II, it all ends here.  MPEG-1 produces two
  88. //    granules for 1152 samples and MPEG-2 produces one granule of 576
  89. //    samples.
  90.  
  91. #include <math.h>
  92. #include <float.h>
  93. #include "engine.h"
  94. #include "bitreader.h"
  95.  
  96. //#define RDTSC_PROFILE
  97.  
  98. #ifdef RDTSC_PROFILE
  99.  
  100.     #include <windows.h>
  101.  
  102.     static long p_lasttime;
  103.     static long p_frames=0;
  104.     static __int64 p_total=0;
  105.     static __int64 p_scalefac=0;
  106.     static __int64 p_huffdec=0;
  107.     static __int64 p_huffdec1=0;
  108.     static __int64 p_dequan=0;
  109.     static __int64 p_stereo=0;
  110.     static __int64 p_hybrid=0;
  111.     static __int64 p_polyphase=0;
  112.  
  113.     static void __inline profile_set(int) {
  114.         __asm {
  115.             rdtsc
  116.             mov        p_lasttime,eax
  117.         };
  118.     }
  119.  
  120.     static void __inline profile_add(__int64& counter) {
  121.         long diff;
  122.  
  123.         __asm {
  124.             rdtsc
  125.             sub        eax,p_lasttime
  126.             mov        diff,eax
  127.         }
  128.  
  129.         counter += diff;
  130.         p_total += diff;
  131.  
  132.         __asm {
  133.             rdtsc
  134.             mov        p_lasttime,eax
  135.  
  136.         }
  137.     }
  138. #else
  139.  
  140.     #define profile_set(x)
  141.     #define profile_add(x)
  142.  
  143. #endif
  144.  
  145.  
  146. namespace {
  147.     struct LayerIIIRegionSideInfo {
  148.         unsigned    part2_3_length;
  149.         unsigned    big_values;
  150.         unsigned    global_gain;
  151.         unsigned    scalefac_compress;
  152.         bool        window_switching_flag;
  153.         uint8    table_select[3];    // 3 for unswitched, 2 for switched
  154.         union {
  155.             struct {
  156.                 uint8    block_type;
  157.                 bool    mixed_block_flag;
  158.                 uint8    subblock_gain[3];
  159.             } switched;
  160.             struct {
  161.                 uint8    region0_count;
  162.                 uint8    region1_count;
  163.             } unswitched;
  164.         };
  165.         bool        preflag;
  166.         bool        scalefac_scale;
  167.         bool        count1table_select;
  168.     };
  169.     struct LayerIIISideInfo {
  170.         unsigned    main_data_begin;
  171.         bool        scfsi[2][4];
  172.         LayerIIIRegionSideInfo gr[2][2];
  173.     };
  174.  
  175.     void DecodeSideInfoMPEG1(LayerIIISideInfo& si, const uint8 *src, unsigned nch) {
  176.         VDMPEGAudioBitReader bits(src, 33);
  177.         unsigned gr, ch, i;
  178.  
  179.         si.main_data_begin    = bits.get(9);
  180.  
  181.         bits.get(nch>1 ? 3 : 5);        // skip private bits
  182.  
  183.         // read scale factor selectors (ch x 4 bits)
  184.         for(ch=0; ch<nch; ++ch)
  185.             for(i=0; i<4; ++i)
  186.                 si.scfsi[ch][i] = bits.getbool();
  187.  
  188.         // read global regions (2 x ch x 56 bits)
  189.         for(gr=0; gr<2; ++gr) {
  190.             for(ch=0; ch<nch; ++ch) {
  191.                 LayerIIIRegionSideInfo& rsi = si.gr[gr][ch];
  192.  
  193.                 rsi.part2_3_length            = bits.get(12);
  194.                 rsi.big_values                = bits.get(9);
  195.                 rsi.global_gain                = bits.get(8);
  196.                 rsi.scalefac_compress        = bits.get(4);
  197.                 rsi.window_switching_flag    = bits.getbool();
  198.  
  199.                 if (rsi.window_switching_flag) {
  200.                     rsi.switched.block_type            = bits.get(2);
  201.                     rsi.switched.mixed_block_flag    = bits.getbool();
  202.                     for(unsigned r=0; r<2; ++r)
  203.                         rsi.table_select[r] = bits.get(5);
  204.                     rsi.table_select[2] = 0;
  205.                     for(unsigned w=0; w<3; ++w)
  206.                         rsi.switched.subblock_gain[w] = bits.get(3);
  207.                 } else {
  208.                     for(unsigned r=0; r<3; ++r)
  209.                         rsi.table_select[r] = bits.get(5);
  210.                     rsi.unswitched.region0_count = bits.get(4);
  211.                     rsi.unswitched.region1_count = bits.get(3);
  212.                 }
  213.  
  214.                 rsi.preflag = bits.getbool();
  215.                 rsi.scalefac_scale    = bits.getbool();
  216.                 rsi.count1table_select = bits.getbool();
  217.             }
  218.         }
  219.     }
  220.     
  221.     void DecodeSideInfoMPEG2(LayerIIISideInfo& si, const uint8 *src, unsigned nch) {
  222.         VDMPEGAudioBitReader bits(src, 33);
  223.  
  224.         si.main_data_begin    = bits.get(8);
  225.  
  226.         bits.get(nch>1 ? 2 : 1);        // skip private bits
  227.  
  228.         for(unsigned ch=0; ch<nch; ++ch) {        // 63 bits per channel
  229.             LayerIIIRegionSideInfo& rsi = si.gr[0][ch];
  230.  
  231.             rsi.part2_3_length            = bits.get(12);
  232.             rsi.big_values                = bits.get(9);
  233.             rsi.global_gain                = bits.get(8);
  234.             rsi.scalefac_compress        = bits.get(9);
  235.             rsi.window_switching_flag    = bits.getbool();
  236.  
  237.             if (rsi.window_switching_flag) {
  238.                 rsi.switched.block_type            = bits.get(2);
  239.                 rsi.switched.mixed_block_flag    = bits.getbool();
  240.                 for(unsigned r=0; r<2; ++r)
  241.                     rsi.table_select[r] = bits.get(5);
  242.                 rsi.table_select[2] = 0;
  243.                 for(unsigned w=0; w<3; ++w)
  244.                     rsi.switched.subblock_gain[w] = bits.get(3);
  245.             } else {
  246.                 for(unsigned r=0; r<3; ++r)
  247.                     rsi.table_select[r] = bits.get(5);
  248.                 rsi.unswitched.region0_count = bits.get(4);
  249.                 rsi.unswitched.region1_count = bits.get(3);
  250.             }
  251.  
  252.             rsi.scalefac_scale    = bits.getbool();
  253.             rsi.count1table_select = bits.getbool();
  254.         }
  255.     }
  256.  
  257.     static struct costable_t {
  258.         costable_t() {
  259.             for(unsigned i=0; i<36; ++i) {
  260.                 for(unsigned j=0; j<18; ++j) {
  261.                     v[i][j] = (float)cos((3.1415926535897932 / 72) * (i+i+1+18) * (j+j+1));
  262.                 }
  263.             }
  264.         }
  265.  
  266.         float v[36][18];
  267.     } costable;
  268.  
  269.     ////////////////////////////////////
  270.  
  271. #if 0
  272.     void IMDCT_18(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  273.         float t[36];
  274.  
  275.         for(unsigned i=0; i<36; ++i) {
  276.             double s=0;
  277.             for(unsigned j=0; j<18; ++j) {
  278.                 s += in[j] * costable.v[i][j]; //cos((3.1415926535897932 / 72) * (i+i+1+18) * (j+j+1));
  279.             }
  280.  
  281.             t[i] = (float)s;
  282.         }
  283.  
  284.         for(unsigned k=0; k<18; ++k) {
  285.             out[k][0][0] = overlap[k] + t[k]*window[k];
  286.             overlap[k] = t[k+18]*window[k+18];
  287.         }
  288.     }
  289. #else
  290.  
  291.     // The algorithm for this IMDCT comes from HP Laboratories report
  292.     // HPL-2000-66, "Faster MPEG-1 Layer III Audio Decoding" by
  293.     // Scott B. Marovich.  It first uses a technique from the Lee
  294.     // decomposition to shift the IMDCT into an IDCT, then factors
  295.     // the 18-point IDCT down to 4-point and 5-point IDCTs.  Profiling
  296.     // seems to indicate that this is not as fast as FreeAmp's IMDCT,
  297.     // which appears to use a direct IMDCT factorization instead, but
  298.     // layer III decoding is rather rare in MPEG-1 video files and
  299.     // this runs fast enough.
  300.  
  301.     void IMDCT_9(const float (*b)[2], float *d) {
  302.         static const float c1 = 0.93969262078591f;    // cos(1*(pi/9))
  303.         static const float c2 = 0.76604444311898f;    // cos(2*(pi/9))
  304.         static const float c4 = 0.17364817766693f;    // cos(4*(pi/9))
  305.  
  306.         // 'g' stage (5-point IDCT)
  307.         const float G0 = b[0][0];
  308.         const float G1 = b[2][0];
  309.         const float G2 = b[4][0];
  310.         const float G3 = b[6][0];
  311.         const float G4 = b[8][0];
  312.         const float x0 = G3*0.5f + G0;
  313.         const float x1 = G0 - G3;
  314.         const float x2 = G1 - G2 - G4;
  315.         const float g0 = x0 + c1*G1 + c2*G2 + c4*G4;
  316.         const float g1 = x2*0.5f + x1;
  317.         const float g2 = x0 - c4*G1 - c1*G2 + c2*G4;
  318.         const float g3 = x0 - c2*G1 + c4*G2 - c1*G4;
  319.         const float g4 = x1 - x2;
  320.  
  321.         // 'h prime' stage (4-point IDCT)
  322.         static const float odd[4]={
  323.             0.50771330594287f,
  324.             0.57735026918963f,
  325.             0.77786191343021f,
  326.             1.46190220008154f
  327.         };
  328.  
  329.         static const float covals[4]={    // odd[i] * sin((pi/18)*(2i+1))*(-1)^i
  330.             0.08816349035423f,
  331.             -0.28867513459481f,
  332.             0.59587679629710f,
  333.             -1.37373870972731f,
  334.         };
  335.  
  336.         const float H0 = b[1][0];
  337.         const float H1 = b[1][0] + b[3][0];
  338.         const float H2 = b[3][0] + b[5][0];
  339.         const float H3 = b[5][0] + b[7][0];
  340.         const float y0 = H3*0.5f + H0;
  341.         const float y1 = H1 - H2;
  342.         const float h0 = odd[0]*(y0 + c1*H1 + c2*H2) + b[7][0]*covals[0];
  343.         const float h1 = odd[1]*(0.5f*y1 + H0 - H3)  + b[7][0]*covals[1];
  344.         const float h2 = odd[2]*(y0 - c4*H1 - c1*H2) + b[7][0]*covals[2];
  345.         const float h3 = odd[3]*(y0 - c2*H1 + c4*H2) + b[7][0]*covals[3];
  346.  
  347.         d[0] = g0 + h0;
  348.         d[1] = g1 + h1;
  349.         d[2] = g2 + h2;
  350.         d[3] = g3 + h3;
  351.         d[4] = g4;
  352.         d[5] = g3 - h3;
  353.         d[6] = g2 - h2;
  354.         d[7] = g1 - h1;
  355.         d[8] = g0 - h0;
  356.     }
  357.  
  358.     void IMDCT_18(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  359.         static const float pi = 3.1415926535897932384626433832795f;
  360.  
  361.         float a[18];
  362.         float t[18];
  363.  
  364.         a[0] = in[0];
  365.         a[1] = in[0]+in[1];
  366.         a[2] = in[1]+in[2];
  367.         a[3] = in[2]+in[3];
  368.         a[4] = in[3]+in[4];
  369.         a[5] = in[4]+in[5];
  370.         a[6] = in[5]+in[6];
  371.         a[7] = in[6]+in[7];
  372.         a[8] = in[7]+in[8];
  373.         a[9] = in[8]+in[9];
  374.         a[10] = in[9]+in[10];
  375.         a[11] = in[10]+in[11];
  376.         a[12] = in[11]+in[12];
  377.         a[13] = in[12]+in[13];
  378.         a[14] = in[13]+in[14];
  379.         a[15] = in[14]+in[15];
  380.         a[16] = in[15]+in[16];
  381.         a[17] = in[16]+in[17];
  382.  
  383.         a[17] += a[15];
  384.         a[15] += a[13];
  385.         a[13] += a[11];
  386.         a[11] += a[9];
  387.         a[9] += a[7];
  388.         a[7] += a[5];
  389.         a[5] += a[3];
  390.         a[3] += a[1];
  391.  
  392.         float d[18];
  393.  
  394.         IMDCT_9((float(*)[2])&a[0], d);
  395.         IMDCT_9((float(*)[2])&a[1], d+9);
  396.         static const float coeff_9_to_18[9]={        // 1 / (2 cos((pi/36)(2i+1)))
  397.             0.50190991877167f,
  398.             0.51763809020504f,
  399.             0.55168895948125f,
  400.             0.61038729438073f,
  401.             0.70710678118655f,
  402.             0.87172339781055f,
  403.             1.18310079157625f,
  404.             1.93185165257814f,
  405.             5.73685662283492f,
  406.         };
  407.  
  408.         const float y[9]={
  409.             d[9+0] * coeff_9_to_18[0],
  410.             d[9+1] * coeff_9_to_18[1],
  411.             d[9+2] * coeff_9_to_18[2],
  412.             d[9+3] * coeff_9_to_18[3],
  413.             d[9+4] * coeff_9_to_18[4],
  414.             d[9+5] * coeff_9_to_18[5],
  415.             d[9+6] * coeff_9_to_18[6],
  416.             d[9+7] * coeff_9_to_18[7],
  417.             d[9+8] * coeff_9_to_18[8],
  418.         };
  419.  
  420.         // Note: The second half of this butterfly has been reversed.
  421.         t[ 0] = d[0] + y[0];
  422.         t[ 9] = d[0] - y[0];
  423.         t[ 1] = d[1] + y[1];
  424.         t[10] = d[1] - y[1];
  425.         t[ 2] = d[2] + y[2];
  426.         t[11] = d[2] - y[2];
  427.         t[ 3] = d[3] + y[3];
  428.         t[12] = d[3] - y[3];
  429.         t[ 4] = d[4] + y[4];
  430.         t[13] = d[4] - y[4];
  431.         t[ 5] = d[5] + y[5];
  432.         t[14] = d[5] - y[5];
  433.         t[ 6] = d[6] + y[6];
  434.         t[15] = d[6] - y[6];
  435.         t[ 7] = d[7] + y[7];
  436.         t[16] = d[7] - y[7];
  437.         t[ 8] = d[8] + y[8];
  438.         t[17] = d[8] - y[8];
  439.  
  440.         // multiplication to convert idct to imdct has already been folded
  441.         // into the windows
  442.  
  443.         // y[0..8]   =  x[9..17]  = t[17..9]
  444.         // y[9..17]  = -x[17..9]  = -t[9..17]
  445.         // y[18..26] = -x[8..0]   = -t[8..0]        (negation folded into window)
  446.         // y[27..35] = -x[0..8]   = -t[0..8]        (negation folded into window)
  447.  
  448. #if 0
  449.         for(unsigned k=0; k<9; ++k) {
  450.             out[k][0][0] = overlap[k] + t[17-k]*window[k];
  451.             out[k+9][0][0] = overlap[k+9] - t[k+9]*window[k+9];
  452.             overlap[k] = t[8-k]*window[k+18];
  453.             overlap[k+9] = t[k]*window[k+27];
  454.         }
  455. #else
  456.         out[0][0][0] = overlap[0] + t[17]*window[0];
  457.         out[1][0][0] = overlap[1] + t[16]*window[1];
  458.         out[2][0][0] = overlap[2] + t[15]*window[2];
  459.         out[3][0][0] = overlap[3] + t[14]*window[3];
  460.         out[4][0][0] = overlap[4] + t[13]*window[4];
  461.         out[5][0][0] = overlap[5] + t[12]*window[5];
  462.         out[6][0][0] = overlap[6] + t[11]*window[6];
  463.         out[7][0][0] = overlap[7] + t[10]*window[7];
  464.         out[8][0][0] = overlap[8] + t[ 9]*window[8];
  465.         out[9][0][0] = overlap[9] - t[ 9]*window[9];
  466.         out[10][0][0] = overlap[10] - t[10]*window[10];
  467.         out[11][0][0] = overlap[11] - t[11]*window[11];
  468.         out[12][0][0] = overlap[12] - t[12]*window[12];
  469.         out[13][0][0] = overlap[13] - t[13]*window[13];
  470.         out[14][0][0] = overlap[14] - t[14]*window[14];
  471.         out[15][0][0] = overlap[15] - t[15]*window[15];
  472.         out[16][0][0] = overlap[16] - t[16]*window[16];
  473.         out[17][0][0] = overlap[17] - t[17]*window[17];
  474.         overlap[0] = t[8]*window[18];
  475.         overlap[1] = t[7]*window[19];
  476.         overlap[2] = t[6]*window[20];
  477.         overlap[3] = t[5]*window[21];
  478.         overlap[4] = t[4]*window[22];
  479.         overlap[5] = t[3]*window[23];
  480.         overlap[6] = t[2]*window[24];
  481.         overlap[7] = t[1]*window[25];
  482.         overlap[8] = t[0]*window[26];
  483.         overlap[9] = t[0]*window[27];
  484.         overlap[10] = t[1]*window[28];
  485.         overlap[11] = t[2]*window[29];
  486.         overlap[12] = t[3]*window[30];
  487.         overlap[13] = t[4]*window[31];
  488.         overlap[14] = t[5]*window[32];
  489.         overlap[15] = t[6]*window[33];
  490.         overlap[16] = t[7]*window[34];
  491.         overlap[17] = t[8]*window[35];
  492. #endif
  493.     }
  494.  
  495.     void IMDCT_18_Null(float (*out)[2][32], float *overlap) {
  496.         for(unsigned k=0; k<18; ++k) {
  497.             out[k][0][0] = overlap[k];
  498.             overlap[k] = 0;
  499.         }
  500.     }
  501. #endif
  502.  
  503.     ////////////////////////////////////////////////////////////
  504.  
  505. #if 0
  506.     void IMDCT_6_3(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  507.         float t[24]={0};
  508.  
  509.         unsigned k;
  510.  
  511.         for(unsigned w=0; w<3; ++w) {
  512.             for(unsigned i=0; i<12; ++i) {
  513.                 double s=0;
  514.                 for(unsigned j=0; j<6; ++j) {
  515.                     s += in[j*3] * cos((3.1415926535897932 / 24) * (i+i+1+6) * (j+j+1));
  516.                 }
  517.  
  518.                 t[i+w*6] += (float)s * window[i];
  519.             }
  520.  
  521.             ++in;
  522.         }
  523.  
  524.         for(k=0; k<6; ++k)
  525.             out[k][0][0] = overlap[k];
  526.  
  527.         for(k=0; k<12; ++k) {
  528.             out[k+6][0][0] = overlap[k+6] + t[k];
  529.             overlap[k] = t[k+12];
  530.         }
  531.  
  532.         for(k=0; k<6; ++k)
  533.             overlap[k+12] = 0;
  534.     }
  535. #else
  536.  
  537.     void IMDCT_6(float dst[12], const float src[18]) {
  538. #if 0
  539.         {
  540.             for(unsigned i=0; i<12; ++i) {
  541.                 double s=0;
  542.                 for(unsigned j=0; j<6; ++j) {
  543.                     s += src[j*3] * cos((3.1415926535897932 / 24) * (i+i+1+6) * (j+j+1));
  544.                 }
  545.  
  546.                 dst[i] = (float)s;
  547.             }
  548.         }
  549. #else
  550.         //////////////
  551.  
  552.         static const float root3div2 = 0.86602540378443864676372317075294f;
  553.         static const float pi = 3.1415926535897932384626433832795f;
  554.  
  555.         static const float oddscale[3]={        // 1/(2*cos(pi/12*[1 3 5])) - part of Lee decomposition of 6pt IDCT
  556.             0.51763809020504f,
  557.             0.70710678118655f,
  558.             1.93185165257814f,
  559.         };
  560.  
  561.         static const float finalscale[6]={        // 1/(2*cos(pi/24*[1:2:11])) - part of conversion from IDCT to IMDCT
  562.             0.504314480290076f,
  563.             0.541196100146197f,
  564.             0.630236207005132f,
  565.             0.821339815852291f,
  566.             1.30656296487638f,
  567.             3.83064878777019f,
  568.         };
  569.  
  570.         float a[6]={src[0], src[0]+src[3], src[3]+src[6], src[6]+src[9], src[9]+src[12], src[12]+src[15]};
  571.         float c1[6] = { a[0], a[2], a[4], a[1], a[1]+a[3], a[3]+a[5] };
  572.         float c2[6]={
  573.             c1[0] + c1[1]*root3div2 + c1[2]*0.5f,
  574.             c1[0] - c1[2],
  575.             c1[0] - c1[1]*root3div2 + c1[2]*0.5f,
  576.  
  577.             (c1[3] + c1[4]*root3div2 + c1[5]*0.5f) * oddscale[0],
  578.             (c1[3]                     - c1[5]     ) * oddscale[1],
  579.             (c1[3] - c1[4]*root3div2 + c1[5]*0.5f) * oddscale[2],
  580.         };
  581.         float b[6]={
  582.             (c2[0]+c2[3]) * finalscale[0],
  583.             (c2[1]+c2[4]) * finalscale[1],
  584.             (c2[2]+c2[5]) * finalscale[2],
  585.             (c2[2]-c2[5]) * finalscale[3],
  586.             (c2[1]-c2[4]) * finalscale[4],
  587.             (c2[0]-c2[3]) * finalscale[5],
  588.         };
  589.  
  590.         dst[0] = b[3];
  591.         dst[1] = b[4];
  592.         dst[2] = b[5];
  593.         dst[3] = -b[5];
  594.         dst[4] = -b[4];
  595.         dst[5] = -b[3];
  596.         dst[6] = -b[2];
  597.         dst[7] = -b[1];
  598.         dst[8] = -b[0];
  599.         dst[9] = -b[0];
  600.         dst[10] = -b[1];
  601.         dst[11] = -b[2];
  602. #endif
  603.     }
  604.  
  605.     // This also comes from the Marovich paper.  It's not as optimized, but
  606.     // short blocks are comparatively rare.
  607.  
  608.     void IMDCT_6_3(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  609.         float t[24]={0};
  610.  
  611.         unsigned k;
  612.  
  613.         for(unsigned w=0; w<3; ++w) {
  614.             float u[12];
  615.  
  616.             IMDCT_6(u, in+w);
  617.  
  618.             for(unsigned i=0; i<12; ++i)
  619.                 t[i+w*6] += (float)(u[i] * window[i]);
  620.         }
  621.  
  622.         for(k=0; k<6; ++k)
  623.             out[k][0][0] = overlap[k];
  624.  
  625.         for(k=0; k<12; ++k) {
  626.             out[k+6][0][0] = overlap[k+6] + t[k];
  627.             overlap[k] = t[k+12];
  628.         }
  629.  
  630.         for(k=0; k<6; ++k)
  631.             overlap[k+12] = 0;
  632.     }
  633. #endif
  634.  
  635.     ////////////////////////////////////
  636.  
  637.     static void DecodeScalefactorsMPEG1(const LayerIIISideInfo& sideinfo, VDMPEGAudioHuffBitReader& bits, uint8 *scalefac_l, uint8 *scalefac_s, unsigned gr, unsigned ch) {
  638.         const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][ch];
  639.  
  640.         // read in scalefactors
  641.         static const uint8 slen1_table[16]={0,0,0,0,3,1,1,1,2,2,2,3,3,3,4,4};
  642.         static const uint8 slen2_table[16]={0,1,2,3,0,1,2,3,1,2,3,1,2,3,2,3};
  643.         unsigned i;
  644.         const unsigned slen1 = slen1_table[rsi.scalefac_compress];
  645.         const unsigned slen2 = slen2_table[rsi.scalefac_compress];
  646.  
  647.         if (rsi.window_switching_flag && rsi.switched.block_type == 2) {    // short blocks
  648.             // Scalefactors for short blocks are ordered as
  649.             // scalefac_s[sfb][window].  We store them as one big array for
  650.             // convenience.
  651.  
  652.             if (rsi.switched.mixed_block_flag) {
  653.                 for(i=0; i<8; ++i)                        // long bands 0-7
  654.                     *scalefac_l++ = bits.get(slen1);
  655.  
  656.                 scalefac_s += 3*3;
  657.                 for(i=0; i<9; ++i)                        // short bands 3-5
  658.                     *scalefac_s++ = slen1?bits.get(slen1):0;
  659.  
  660.                 for(i=0; i<18; ++i)                        // short bands 6-11
  661.                     *scalefac_s++ = slen2?bits.get(slen2):0;
  662.             } else {
  663.                 for(i=0; i<18; ++i)                        // short bands 0-5
  664.                     *scalefac_s++ = slen1?bits.get(slen1):0;
  665.  
  666.                 for(i=0; i<18; ++i)
  667.                     *scalefac_s++ = slen2?bits.get(slen2):0;
  668.             }
  669.         } else {        // long blocks
  670.             // Long blocks are split into four bands.  Depending on
  671.             // the scalefactor selectors, some bands will share
  672.             // scalefactors between regions, and some won't.
  673.  
  674.             unsigned sfb = 0;
  675.  
  676.             static const unsigned sfblimit[4]={6,11,16,21};
  677.             unsigned slen[4]={slen1,slen1,slen2,slen2};
  678.  
  679.             for(unsigned r=0; r<4; ++r) {
  680.                 if (!gr || !sideinfo.scfsi[ch][r]) {
  681.                     if (unsigned slen_bits = slen[r]) {
  682.                         for(; sfb<sfblimit[r]; ++sfb)
  683.                             scalefac_l[sfb] = bits.get(slen_bits);
  684.                     } else {
  685.                         for(; sfb<sfblimit[r]; ++sfb)
  686.                             scalefac_l[sfb] = 0;
  687.                     }
  688.                 } else
  689.                     sfb = sfblimit[r];
  690.             }
  691.         }
  692.     }
  693.  
  694.     static void DecodeScalefactorsMPEG2(LayerIIIRegionSideInfo& rsi, VDMPEGAudioHuffBitReader& bits, uint8 *scalefac_l, uint8 *scalefac_s, unsigned mode_ext, unsigned ch) {
  695.         // Tables of bit allocations per scalefactor breakdown type,
  696.         // three cases per type (long, short, mixed).  Note that for
  697.         // the mixed case the first six scalefactor bands (long bands)
  698.         // have been subtracted from the table.
  699.  
  700.         static const uint8 is_on_566_tab[3][4] = {{7,7,7,0}, {12,12,12,0}, {0,15,12,0}};
  701.         static const uint8 is_on_444_tab[3][4] = {{6,6,6,3}, {12,9,9,6}, {0,12,9,6}};
  702.         static const uint8 is_on_430_tab[3][4] = {{8,8,5,0}, {15,12,9,0}, {0,18,9,0}};
  703.         static const uint8 is_off_5544_tab[3][4] = {{6,5,5,5}, {9,9,9,9}, {0,9,9,9}};
  704.         static const uint8 is_off_5540_tab[3][4] = {{6,5,7,3}, {9,9,12,6}, {0,9,12,6}};
  705.         static const uint8 is_off_4300_tab[3][4] = {{11,10,0,0}, {18,18,0,0}, {9,18,0,0}};
  706.  
  707.         unsigned v = rsi.scalefac_compress;
  708.         unsigned set_bits[4];
  709.  
  710.         rsi.preflag = 0;
  711.  
  712.         const uint8 *set_count;
  713.         const unsigned windowtype = rsi.window_switching_flag && rsi.switched.block_type==2 ? rsi.switched.mixed_block_flag ? 2 : 1 : 0;
  714.  
  715.         if ((mode_ext & 1) && ch) {    // second channel w/ intensity stereo enabled
  716.             v >>= 1;            // LSB is used for intensity scale.
  717.  
  718.             if (v < 180) {            // 5 x 6 x 6 = 180 types
  719.                 set_bits[0] = v / 36;
  720.                 set_bits[1] = (v % 36) / 6;
  721.                 set_bits[2] = v % 6;
  722.                 set_bits[3] = 0;
  723.                 set_count = is_on_566_tab[windowtype];
  724.             } else if (v < 244) {    // 4 x 4 x 4 = 64 types
  725.                 v -= 180;
  726.                 set_bits[0] = (v>>4) & 3;
  727.                 set_bits[1] = (v>>2) & 3;
  728.                 set_bits[2] = v & 3;
  729.                 set_bits[3] = 0;
  730.                 set_count = is_on_444_tab[windowtype];
  731.             } else {                // 4 x 3 = 12 types
  732.                 v -= 244;
  733.                 set_bits[0] = v / 3;
  734.                 set_bits[1] = v % 3;
  735.                 set_bits[2] = 0;
  736.                 set_bits[3] = 0;
  737.                 set_count = is_on_430_tab[windowtype];
  738.             }
  739.         } else {                    // first channel or intensity stereo disabled
  740.             if (v < 400) {            // 5 x 5 x 4 x 4 = 400 types
  741.                 set_bits[0] = (v>>4) / 5;
  742.                 set_bits[1] = (v>>4) % 5;
  743.                 set_bits[2] = (v>>2) & 3;
  744.                 set_bits[3] = v & 3;
  745.                 set_count = is_off_5544_tab[windowtype];
  746.             } else if (v < 500) {    // 5 x 5 x 4 = 100 types
  747.                 v -= 400;
  748.                 set_bits[0] = (v>>2) / 5;
  749.                 set_bits[1] = (v>>2) % 5;
  750.                 set_bits[2] = v & 3;
  751.                 set_bits[3] = 0;
  752.                 set_count = is_off_5540_tab[windowtype];
  753.             } else {                // 4 x 3 = 12 types
  754.                 v -= 500;
  755.                 set_bits[0] = v / 3;
  756.                 set_bits[1] = v % 3;
  757.                 set_bits[2] = 0;
  758.                 set_bits[3] = 0;
  759.                 set_count = is_off_4300_tab[windowtype];
  760.                 rsi.preflag = 1;
  761.             }
  762.         }
  763.  
  764.         uint8 *dst = scalefac_l;
  765.  
  766.         if (rsi.window_switching_flag && rsi.switched.block_type == 2) {
  767.             if (rsi.switched.mixed_block_flag) {
  768.                 // The first six scalefactors, which are in the long band of
  769.                 // mixed blocks, are always in the first set of scalefactors
  770.                 // so we just read them now.
  771.  
  772.                 for(unsigned i=0; i<6; ++i)
  773.                     scalefac_l[i] = set_bits[0] ? bits.get(set_bits[0]) : 0;
  774.  
  775.                 scalefac_s += 9;    // skip first three short bands
  776.             }
  777.  
  778.             dst = scalefac_s;
  779.         }
  780.  
  781.         for(unsigned set=0; set<4; ++set) {
  782.             for(unsigned i=0; i<set_count[set]; ++i)
  783.                 *dst++ = set_bits[set] ? bits.get(set_bits[set]) : 0;
  784.         }
  785.     }
  786. };
  787.  
  788. void VDMPEGAudioDecoder::PrereadLayerIII() {
  789.     const bool is_mpeg2 = mHeader.nMPEGVer > 1;
  790.     const unsigned nch = mMode != 3 ? 2 : 1;
  791.  
  792.     // fill Huffman buffer
  793.  
  794.     unsigned sidelen = is_mpeg2 ? (nch>1 ? 17 : 9) : (nch>1 ? 32 : 17);
  795.     unsigned left = mFrameDataSize - sidelen;
  796.     const uint8 *src = mFrameBuffer + sidelen;
  797.  
  798.     while(left > 0) {
  799.         unsigned tc = kL3BufferSize - mL3BufferPos;
  800.         if (tc > left)
  801.             tc = left;
  802.         memcpy(mL3Buffer+mL3BufferPos, src, tc);
  803.         mL3BufferPos = (mL3BufferPos + tc) & (kL3BufferSize-1);
  804.         src += tc;
  805.         left -= tc;
  806.     }
  807. }
  808.  
  809. bool VDMPEGAudioDecoder::DecodeLayerIII() {
  810.     LayerIIISideInfo sideinfo;
  811.     const unsigned nch = mMode != 3 ? 2 : 1;
  812.     const bool is_mpeg2 = mHeader.nMPEGVer > 1;
  813.  
  814.     if (is_mpeg2)
  815.         DecodeSideInfoMPEG2(sideinfo, mFrameBuffer, nch);
  816.     else
  817.         DecodeSideInfoMPEG1(sideinfo, mFrameBuffer, nch);
  818.  
  819. //    static int frameno = -1;
  820. //    ++frameno;
  821.  
  822.     const unsigned sidelen = is_mpeg2 ? (nch>1 ? 17 : 9) : (nch>1 ? 32 : 17);
  823.  
  824.     profile_set(0);
  825.  
  826.     VDMPEGAudioHuffBitReader bits(mL3Buffer, mL3BufferPos - sideinfo.main_data_begin - (mFrameDataSize - sidelen));
  827.  
  828.     uint8 scalefac_l[2][22]={0};
  829.     uint8 scalefac_s[2][13][3]={0};
  830.  
  831.     // There are only 21 long bands and 12 short bands in MPEG audio, but
  832.     // data above the tables is considered to have a scalefactor of zero.
  833.     // The standard doesn't say whether short block reordering occurs --
  834.     // reference appears to do it, so we'll just pretend there is an
  835.     // extra band when dequantizing.
  836.  
  837.     static const unsigned sLongScalefactorBands[3][3][23]={
  838.         {
  839.             {0,4,8,12,16,20,24,30,36,44,52,62,74,90,110,134,162,196,238,288,342,418,576},    // 44.1KHz
  840.             {0,4,8,12,16,20,24,30,36,42,50,60,72,88,106,128,156,190,230,276,330,384,576},    // 48KHz
  841.             {0,4,8,12,16,20,24,30,36,44,54,66,82,102,126,156,194,240,296,364,448,550,576},    // 32KHz
  842.         },
  843.         {
  844.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},    // 22KHz
  845.             {0,6,12,18,24,30,36,44,54,66,80,96,114,136,162,194,232,278,332,394,464,540,576},    // 24KHz
  846.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},    // 16KHz
  847.         },
  848.         {
  849.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},        // 11KHz
  850.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},        // 12KHz
  851.             {0,12,24,36,48,60,72,88,108,132,160,192,232,280,336,400,476,566,568,570,572,574,576},    // 8KHz
  852.         }
  853.     };
  854.  
  855.     static const unsigned sShortScalefactorBands[3][3][14]={
  856.         {
  857.             {0,4,8,12,16,22,30,40,52,66,84,106,136,192},    // 44.1KHz
  858.             {0,4,8,12,16,22,28,38,50,64,80,100,126,192},    // 48KHz
  859.             {0,4,8,12,16,22,30,42,58,78,104,138,180,192},    // 32KHz
  860.         },
  861.         {
  862.             {0,4,8,12,18,24,32,42,56,74,100,132,174,192},    // 22KHz
  863.             {0,4,8,12,18,26,36,48,62,80,104,136,180,192},    // 24KHz
  864.             {0,4,8,12,18,26,36,48,62,80,104,134,174,192},    // 16KHz
  865.         },
  866.         {
  867.             {0,4,8,12,18,26,36,48,62,80,104,136,174,192},    // 11KHz
  868.             {0,4,8,12,18,26,36,48,62,80,104,136,174,192},    // 12KHz
  869.             {0,8,16,24,36,52,72,96,124,160,162,164,166,192},    // 8KHz
  870.         }
  871.     };
  872.  
  873.     unsigned mpegtype = mHeader.lSamplingFreq < 16000 ? 2 : is_mpeg2 ? 1 : 0;
  874.  
  875.     const unsigned *const pLongBands = sLongScalefactorBands[mpegtype][mSamplingRateIndex];
  876.     const unsigned *const pShortBands = sShortScalefactorBands[mpegtype][mSamplingRateIndex];
  877.  
  878.     for(unsigned gr=0; gr<(is_mpeg2 ? 1U : 2U); ++gr) {
  879.         float recon[2][576] = {0};
  880.         unsigned ch;
  881.         unsigned ms_bound;
  882.         unsigned is_band;
  883.         unsigned gr_zero_bound = 0;
  884.  
  885.         for(ch=0; ch<nch; ++ch) {
  886.             const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][ch];
  887.             unsigned region_end = bits.pos() + rsi.part2_3_length;
  888.  
  889.             if (is_mpeg2)
  890.                 DecodeScalefactorsMPEG2(sideinfo.gr[gr][ch], bits, &scalefac_l[ch][0], &scalefac_s[ch][0][0], mModeExtension, ch);
  891.             else
  892.                 DecodeScalefactorsMPEG1(sideinfo, bits, &scalefac_l[ch][0], &scalefac_s[ch][0][0], gr, ch);
  893.  
  894.             VDASSERT(scalefac_l[ch][21] == 0);
  895.             VDASSERT(scalefac_s[ch][12][0] == 0);
  896.  
  897.             profile_add(p_scalefac);
  898.  
  899.             // decode big huffman-encoded samples -- three regions
  900.             sint32 freq[576+4];
  901.             unsigned region1_start;
  902.             unsigned region2_start;
  903.             unsigned count1_start    = 2*rsi.big_values;
  904.  
  905.             if (rsi.window_switching_flag) {
  906.                 const unsigned block_type = rsi.switched.block_type;
  907.  
  908.                 // Sadly, although the breakpoint is conveniently at frequency
  909.                 // line 36 in both cases in MPEG-1, in MPEG-2 the long window
  910.                 // has band 8 at line 56 instead, and in MPEG-2.5 the short
  911.                 // window breakpoint is at line 72.
  912.  
  913.                 region1_start = block_type == 2 ? pShortBands[3]*3 : pLongBands[8];
  914.                 region2_start = count1_start;
  915.             } else {
  916.                 region1_start = pLongBands[rsi.unswitched.region0_count + 1];
  917.                 region2_start = pLongBands[rsi.unswitched.region0_count + rsi.unswitched.region1_count + 2];
  918.             }
  919.  
  920.             if (region1_start > count1_start)
  921.                 region1_start = count1_start;
  922.             if (region2_start > count1_start)
  923.                 region2_start = count1_start;
  924.  
  925.             VDASSERT(bits.pos() <= region_end);
  926.             DecodeHuffmanValues(bits, freq, rsi.table_select[0], region1_start >> 1);
  927.             VDASSERT(bits.pos() <= region_end);
  928.             DecodeHuffmanValues(bits, freq + region1_start, rsi.table_select[1], (region2_start - region1_start) >> 1);
  929.             VDASSERT(bits.pos() <= region_end);
  930.             DecodeHuffmanValues(bits, freq + region2_start, rsi.table_select[2], (count1_start - region2_start) >> 1);
  931.             VDASSERT(bits.pos() <= region_end);
  932.  
  933.             profile_add(p_huffdec);
  934.  
  935.             // decode little huffman-encoded samples
  936.             
  937.             uint32 val = bits.peek(32);
  938.  
  939.             unsigned zero_bound;
  940.             {
  941.                 unsigned i = count1_start;
  942.  
  943.                 if (rsi.count1table_select) {        // inverted 4-bit table
  944.                     while(i<=576-4 && bits.pos() < region_end) {
  945.                         unsigned c = bits.get(4);
  946.                         int w=0, x=0, y=0, z=0;
  947.  
  948.                         if (!(c & 8))
  949.                             w = bits.get(1) ? -1 : 1;
  950.                         if (!(c & 4))
  951.                             x = bits.get(1) ? -1 : 1;
  952.                         if (!(c & 2))
  953.                             y = bits.get(1) ? -1 : 1;
  954.                         if (!(c & 1))
  955.                             z = bits.get(1) ? -1 : 1;
  956.  
  957.                         freq[i++] = w;
  958.                         freq[i++] = x;
  959.                         freq[i++] = y;
  960.                         freq[i++] = z;
  961.                     }
  962.                 } else {
  963.                     const L3HuffmanTableDescriptor& tablec1 = sL3HuffmanTables[32];
  964.  
  965.                     while(i<=576-4 && bits.pos() < region_end) {
  966.                         unsigned idx = 0;
  967.                         
  968.                         while(tablec1.table[idx][0])
  969.                             idx += tablec1.table[idx][bits.get(1)];
  970.  
  971.                         unsigned c = tablec1.table[idx][1];
  972.                         int w=0, x=0, y=0, z=0;
  973.  
  974.                         if (c & 8)
  975.                             w = bits.get(1) ? -1 : 1;
  976.                         if (c & 4)
  977.                             x = bits.get(1) ? -1 : 1;
  978.                         if (c & 2)
  979.                             y = bits.get(1) ? -1 : 1;
  980.                         if (c & 1)
  981.                             z = bits.get(1) ? -1 : 1;
  982.  
  983.                         freq[i++] = w;
  984.                         freq[i++] = x;
  985.                         freq[i++] = y;
  986.                         freq[i++] = z;
  987.                     }
  988.                 }
  989.  
  990.                 zero_bound = i;
  991.                 while(zero_bound > 0 && !freq[zero_bound-1])
  992.                     --zero_bound;
  993.  
  994.                 if (zero_bound > gr_zero_bound)
  995.                     gr_zero_bound = zero_bound;
  996.  
  997.                 if (ch) {
  998.                     unsigned sfb;
  999.  
  1000.                     // hmm... what band is it in?
  1001.  
  1002.                     if (rsi.window_switching_flag && rsi.switched.block_type==2) {
  1003.                         if (rsi.switched.mixed_block_flag && zero_bound <= 36) {    // mixed block - 35 scalefactor bands
  1004.                             const unsigned long_bands = is_mpeg2 ? 6 : 8;
  1005.  
  1006.                             for(sfb = 0; sfb < long_bands; ++sfb)
  1007.                                 if (zero_bound <= pLongBands[sfb])
  1008.                                     break;
  1009.  
  1010.                             ms_bound = pLongBands[sfb];
  1011.                             is_band = sfb;
  1012.                         } else {                                // short block - 12 scalefactor bands + end band
  1013.                             for(sfb = 0; sfb < 13; ++sfb)
  1014.                                 if (zero_bound <= pShortBands[sfb]*3)
  1015.                                     break;
  1016.  
  1017.                             ms_bound = pShortBands[sfb]*3;
  1018.                             is_band = sfb;
  1019.                         }
  1020.                     } else {                                    // long block - 21 scalefactor bands + end band
  1021.                         for(sfb = 0; sfb < 22; ++sfb)
  1022.                             if (zero_bound <= pLongBands[sfb])
  1023.                                 break;
  1024.  
  1025.                         ms_bound = pLongBands[sfb];
  1026.                         is_band = sfb;
  1027.                     }
  1028.                 }
  1029.  
  1030.                 if (i < 576)
  1031.                     memset(freq+i, 0, sizeof(freq[0])*(576-i));
  1032.  
  1033.                 VDASSERT(i >= 574 || bits.pos() == region_end);
  1034.  
  1035.                 bits.seek(region_end);
  1036.             }
  1037. #if 0
  1038.             {
  1039.                 unsigned chk = 0;
  1040.                 for(unsigned v=0; v<576; ++v) {
  1041.                     chk = (chk<<1) + (chk>>31) + freq[v];
  1042.                 }
  1043.                 VDDEBUG2("frame[%d].gr[%d].ch[%d]: checksum=%x\n", frameno, gr, ch, chk);
  1044.             }
  1045. #endif
  1046.  
  1047.             profile_add(p_huffdec1);
  1048.  
  1049.             // requantization
  1050.  
  1051.             static const float pow43_tab[64]={
  1052.                 -101.593667325965f,
  1053.                 -97.3828002241332f,
  1054.                 -93.2169751786157f,
  1055.                 -89.0971879448896f,
  1056.                 -85.0244912125185f,
  1057.                 -81.0f,
  1058.                 -77.0248977785916f,
  1059.                 -73.1004434553216f,
  1060.                 -69.2279793747556f,
  1061.                 -65.408940536586f,
  1062.                 -61.6448652744185f,
  1063.                 -57.9374077040035f,
  1064.                 -54.2883523318981f,
  1065.                 -50.6996313257169f,
  1066.                 -47.1733450957601f,
  1067.                 -43.71178704119f,
  1068.                 -40.3174735966359f,
  1069.                 -36.993181114957f,
  1070.                 -33.7419916984532f,
  1071.                 -30.5673509403698f,
  1072.                 -27.47314182128f,
  1073.                 -24.4637809962625f,
  1074.                 -21.5443469003188f,
  1075.                 -18.7207544074671f,
  1076.                 -16.0f,
  1077.                 -13.3905182794067f,
  1078.                 -10.9027235569928f,
  1079.                 -8.54987973338348f,
  1080.                 -6.3496042078728f,
  1081.                 -4.32674871092222f,
  1082.                 -2.51984209978975f,
  1083.                 -1.0f,
  1084.                 0.0f,
  1085.                 1.0f,
  1086.                 2.51984209978975f,
  1087.                 4.32674871092222f,
  1088.                 6.3496042078728f,
  1089.                 8.54987973338348f,
  1090.                 10.9027235569928f,
  1091.                 13.3905182794067f,
  1092.                 16.0f,
  1093.                 18.7207544074671f,
  1094.                 21.5443469003188f,
  1095.                 24.4637809962625f,
  1096.                 27.47314182128f,
  1097.                 30.5673509403698f,
  1098.                 33.7419916984532f,
  1099.                 36.993181114957f,
  1100.                 40.3174735966359f,
  1101.                 43.71178704119f,
  1102.                 47.1733450957601f,
  1103.                 50.6996313257169f,
  1104.                 54.2883523318981f,
  1105.                 57.9374077040035f,
  1106.                 61.6448652744185f,
  1107.                 65.408940536586f,
  1108.                 69.2279793747556f,
  1109.                 73.1004434553216f,
  1110.                 77.0248977785916f,
  1111.                 81.0f,
  1112.                 85.0244912125185f,
  1113.                 89.0971879448896f,
  1114.                 93.2169751786157f,
  1115.                 97.3828002241332f,
  1116.             };
  1117.  
  1118.             {
  1119.                 static const uint8 pretab[22]={0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,3,3,2};
  1120.  
  1121.                 bool short_blocks = rsi.window_switching_flag && rsi.switched.block_type == 2;
  1122.                 unsigned long_bands = short_blocks ? rsi.switched.mixed_block_flag ? is_mpeg2 ? 6 : 8 : 0 : 21;
  1123.                 float global_gain = (float)pow(2.0, 0.25*(int)(rsi.global_gain - 210));
  1124.                 float scalefac_scale = rsi.scalefac_scale ? -1.0f : -0.5f;
  1125.                 float gain;
  1126.  
  1127.                 // long bands
  1128.                 unsigned i = 0;
  1129.  
  1130.                 for(unsigned sfb=0; sfb<long_bands; ++sfb) {
  1131.                     unsigned band_end = pLongBands[sfb+1];
  1132.  
  1133.                     if (band_end > zero_bound)
  1134.                         band_end = zero_bound;
  1135.  
  1136.                     int sf = scalefac_l[ch][sfb];
  1137.  
  1138.                     if (rsi.preflag)
  1139.                         sf += pretab[sfb];
  1140.  
  1141.                     gain = global_gain * powf(2.0f, scalefac_scale * sf);
  1142.  
  1143.                     if (i >= count1_start) {
  1144.                         const float recon_tab[3]={-gain, 0.f, +gain};
  1145.  
  1146.                         for(; i < band_end; i += 2) {
  1147.                             recon[ch][i  ] = recon_tab[freq[i  ]+1];
  1148.                             recon[ch][i+1] = recon_tab[freq[i+1]+1];
  1149.                         }
  1150.                     } else {
  1151.                         for(; i < band_end; ++i) {
  1152.                             VDASSERT(abs(freq[i]) < 8192);
  1153.  
  1154.                             sint32 x = freq[i];
  1155.  
  1156.                             if ((unsigned)(x+32) < 64)
  1157.                                 recon[ch][i] = pow43_tab[x+32] * gain;
  1158.                             else {
  1159.                                 float y = (float)pow(fabs((double)x), 4.0/3.0) * gain;
  1160.  
  1161.                                 if (freq[i]<0)
  1162.                                     y = -y;
  1163.  
  1164.                                 recon[ch][i] = (float)y;
  1165.                             }
  1166.                         }
  1167.                     }
  1168.                 }
  1169.  
  1170.                 if (short_blocks) {
  1171.                     int sfb;
  1172.  
  1173.                     if (rsi.switched.mixed_block_flag)
  1174.                         sfb = 3;
  1175.                     else
  1176.                         sfb = 0;
  1177.  
  1178.                     for(; sfb < 13; ++sfb) {
  1179.                         gain = global_gain;
  1180.  
  1181.                         unsigned i = pShortBands[sfb]*3;
  1182.  
  1183.                         for(unsigned window=0; window<3; ++window) {
  1184.                             int sf = scalefac_s[ch][sfb][window];
  1185.  
  1186.                             gain = global_gain * powf(2.0f, scalefac_scale * sf - 2*rsi.switched.subblock_gain[window]);
  1187.  
  1188.                             unsigned j = pShortBands[sfb]*3 + window;
  1189.  
  1190.                             while(j < pShortBands[sfb+1]*3) {
  1191.                                 float y = (float)pow(fabs((double)freq[i]), 4.0/3.0) * gain;
  1192.  
  1193.                                 VDASSERT(abs(freq[i]) < 8192);
  1194.  
  1195.                                 if (freq[i]<0)
  1196.                                     y = -y;
  1197.  
  1198.                                 recon[ch][j] = (float)y;
  1199. //                                printf("recon[%d][%d] = %g\n", ch, i, y);
  1200.                                 ++i;
  1201.                                 j += 3;
  1202.                             }
  1203.                         }
  1204.                     }
  1205.                 }
  1206.  
  1207. #if 0
  1208.             {
  1209.                 double chk = 0;
  1210.                 for(unsigned v=0; v<576; ++v) {
  1211.                     chk += recon[ch][v];
  1212.                 }
  1213.                 VDDEBUG2("frame[%d].gr[%d].ch[%d]: checksum=%g\n", frameno, gr, ch, chk * 32768.0);
  1214.             }
  1215. #endif
  1216.             }
  1217.  
  1218.             profile_add(p_dequan);
  1219.  
  1220.         }
  1221.  
  1222.         // stereo processing
  1223.         //
  1224.         // We can do this after reordering because short block reordering
  1225.         // swaps around samples within a scalefactor band, and the switch
  1226.         // from MS to IS only occurs between bands.
  1227.         //
  1228.         // XXX: There is still a bug here with MPEG-2 intensity stereo
  1229.         //        decoding -- specifically, the zero bound that switches
  1230.         //        IS on is per-window in MPEG-2, but this code only handles
  1231.         //        MPEG-1 mode. MPEG-2 files with intensity stereo are a
  1232.         //        bit hard to come by since LAME doesn't use IS....
  1233.  
  1234.         if (mMode == 1) {                    // joint stereo mode -- mode_ext = MS/IS
  1235.             static const float invsqrt2 = 0.70710678118654752440084436210485f;
  1236.  
  1237.             if (mModeExtension & 2) {        // mid/side stereo mode enabled
  1238.                 if (mModeExtension == 2)
  1239.                     ms_bound = 576;
  1240.  
  1241.                 for(unsigned i=0; i<ms_bound; ++i) {
  1242.                     const float x = recon[0][i] * invsqrt2;
  1243.                     const float y = recon[1][i] * invsqrt2;
  1244.  
  1245.                     recon[0][i] = (float)(x+y);
  1246.                     recon[1][i] = (float)(x-y);
  1247.                 }
  1248.             }
  1249.  
  1250.             if (mModeExtension & 1) {        // intensity stereo mode enabled
  1251.  
  1252.                 // Unfortunately, 11172-3 doesn't say whether IS positions
  1253.                 // above 7 are valid; they are reachable in low subbands
  1254.                 // with a low ms_bound.  Most of them are benign except for
  1255.                 // 9, which has infinite gain -- presumably this is the
  1256.                 // "I'm yelling inside your head" position.  Oh well.
  1257.  
  1258.                 static const float is_left_tab[2][16]={
  1259.                     {    // MPEG-1
  1260.                         0.0f,
  1261.                         0.211324865405187f,
  1262.                         0.366025403784439f,
  1263.                         0.5f,
  1264.                         0.633974596215561f,
  1265.                         0.788675134594813f,
  1266.                         1.0f,
  1267.                         1.36602540378444f,        // 7 (illegal)
  1268.                         2.36602540378444f,
  1269.                         0.0f,                    // 9 - -1/0.  Ugh.
  1270.                         -1.36602540378444f,
  1271.                         -0.366025403784439f,
  1272.                         0.0f,
  1273.                         0.211324865405187f,
  1274.                         0.366025403784439f,
  1275.                         0.5f,
  1276.                     },
  1277.                     {    // MPEG-2            powers of 1/root-root-2
  1278.                         1.00000000000000,
  1279.                         0.84089641525371,
  1280.                         1.0,
  1281.                         0.70710678118655,
  1282.                         1.0,
  1283.                         0.59460355750136,
  1284.                         1.0,
  1285.                         0.50000000000000,
  1286.                         1.0,
  1287.                         0.42044820762686,
  1288.                         1.0,
  1289.                         0.35355339059327,
  1290.                         1.0,
  1291.                         0.29730177875068,
  1292.                         1.0,
  1293.                         0.21022410381343,
  1294.                     }
  1295.                 };
  1296.  
  1297.                 static const float is_right_tab[2][16]={
  1298.                     {    // MPEG-1
  1299.                         1.0f,
  1300.                         0.788675134594813f,
  1301.                         0.633974596215561f,
  1302.                         0.5f,
  1303.                         0.366025403784439f,
  1304.                         0.211324865405187f,
  1305.                         0.0f,
  1306.                         -0.366025403784438f,        // 7 (illegal)
  1307.                         -1.36602540378444f,
  1308.                         0.0f,                    // 9 - 1/0.  Ugh.
  1309.                         2.36602540378444f,
  1310.                         1.36602540378444f,
  1311.                         1.0f,
  1312.                         0.788675134594813f,
  1313.                         0.633974596215562f,
  1314.                         0.5f,
  1315.                     },
  1316.                     {    // MPEG-2            powers of 1/root-root-2
  1317.                         1.00000000000000,
  1318.                         1.0,
  1319.                         0.84089641525371,
  1320.                         1.0,
  1321.                         0.70710678118655,
  1322.                         1.0,
  1323.                         0.59460355750136,
  1324.                         1.0,
  1325.                         0.50000000000000,
  1326.                         1.0,
  1327.                         0.42044820762686,
  1328.                         1.0,
  1329.                         0.35355339059327,
  1330.                         1.0,
  1331.                         0.29730177875068,
  1332.                         1.0,
  1333.                     }
  1334.                 };
  1335.  
  1336.                 // PITA: IS can occur in short blocks, and we need to know that
  1337.                 // since the scalefactors are involved.  The right channel
  1338.                 // determines the switch.
  1339.  
  1340.                 const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][1];
  1341.  
  1342.                 unsigned long_band_start = 0;
  1343.                 unsigned long_band_end = 0;
  1344.                 unsigned short_band_start = 0;
  1345.                 unsigned short_band_end = 0;
  1346.  
  1347.                 // Frequency lines above the last scalefactor band inherit is_pos[]
  1348.                 // from it.
  1349.  
  1350.                 if (rsi.window_switching_flag && rsi.switched.block_type == 2) {
  1351.                     if (rsi.switched.mixed_block_flag) {
  1352.                         if (ms_bound <= pLongBands[8]) {
  1353.                             long_band_start        = is_band;
  1354.                             long_band_end        = 8;
  1355.                             short_band_start    = 3;
  1356.                         } else {
  1357.                             short_band_start    = is_band;
  1358.                         }
  1359.                         short_band_end        = 13;
  1360.                     } else {
  1361.                         short_band_start    = is_band;
  1362.                         short_band_end        = 13;
  1363.                     }
  1364.                     scalefac_s[1][12][0] = scalefac_s[1][11][0];
  1365.                     scalefac_s[1][12][1] = scalefac_s[1][11][1];
  1366.                     scalefac_s[1][12][2] = scalefac_s[1][11][2];
  1367.                 } else {
  1368.                     // dist10 appears to have a bug -- long blocks can't have IS in sfband 0.
  1369.                     if (!is_band)
  1370.                         ++is_band;
  1371.  
  1372.                     long_band_start = is_band;
  1373.                     long_band_end    = 22;
  1374.  
  1375.                     scalefac_l[1][21] = scalefac_l[1][20];
  1376.                 }
  1377.  
  1378.                 unsigned sfb;
  1379.  
  1380.                 for(sfb=long_band_start; sfb<long_band_end; ++sfb) {
  1381.                     const unsigned is_pos = scalefac_l[1][sfb];
  1382.  
  1383.                     if (is_pos != 7) {
  1384.                         const unsigned band_start = pLongBands[sfb];
  1385.                         const unsigned band_end = pLongBands[sfb+1];
  1386.                         float coleft = is_left_tab[is_mpeg2][is_pos];
  1387.                         float coright = is_right_tab[is_mpeg2][is_pos];
  1388.  
  1389.                         if (is_mpeg2 && (sideinfo.gr[gr][1].scalefac_compress & 1)) {
  1390.                             coleft *= coleft;
  1391.                             coright *= coright;
  1392.                         }
  1393.  
  1394.                         for(unsigned i = band_start; i<band_end; ++i) {
  1395.                             const float x = recon[0][i];
  1396.  
  1397.                             recon[0][i] = x*coleft;
  1398.                             recon[1][i] = x*coright;
  1399.                         }
  1400.                     } else if (mModeExtension & 2) {
  1401.                         const unsigned band_start = pLongBands[sfb];
  1402.                         const unsigned band_end = pLongBands[sfb+1];
  1403.  
  1404.                         for(unsigned i = band_start; i<band_end; ++i) {
  1405.                             const float x = recon[0][i] * invsqrt2;
  1406.                             const float y = recon[1][i] * invsqrt2;
  1407.  
  1408.                             recon[0][i] = x+y;
  1409.                             recon[1][i] = x-y;
  1410.                         }
  1411.                     }
  1412.                 }
  1413.  
  1414.                 for(sfb=short_band_start; sfb<short_band_end; ++sfb) {
  1415.                     for(unsigned window=0; window<3; ++window) {
  1416.                         const unsigned is_pos = scalefac_s[1][sfb][window];
  1417.  
  1418.                         if (is_pos != 7) {
  1419.                             const unsigned band_start = pShortBands[sfb]*3 + window;
  1420.                             const unsigned band_end = pShortBands[sfb+1]*3 + window;
  1421.                             float coleft = is_left_tab[is_mpeg2][is_pos];
  1422.                             float coright = is_right_tab[is_mpeg2][is_pos];
  1423.  
  1424.                             if (is_mpeg2 && (sideinfo.gr[gr][1].scalefac_compress & 1)) {
  1425.                                 coleft *= coleft;
  1426.                                 coright *= coright;
  1427.                             }
  1428.  
  1429.                             for(unsigned i=band_start; i<band_end; i+=3) {
  1430.                                 const float x = recon[0][i];
  1431.  
  1432.                                 recon[0][i] = x*coleft;
  1433.                                 recon[1][i] = x*coright;
  1434.                             }
  1435.                         } else if (mModeExtension & 2) {
  1436.                             const unsigned band_start = pShortBands[sfb]*3 + window;
  1437.                             const unsigned band_end = pShortBands[sfb+1]*3 + window;
  1438.  
  1439.                             for(unsigned i=band_start; i<band_end; i+=3) {
  1440.                                 const float x = recon[0][i] * invsqrt2;
  1441.                                 const float y = recon[1][i] * invsqrt2;
  1442.  
  1443.                                 recon[0][i] = (float)(x+y);
  1444.                                 recon[1][i] = (float)(x-y);
  1445.                             }
  1446.                         }
  1447.                     }
  1448.                 }
  1449.             }
  1450.         }
  1451.  
  1452.         profile_add(p_stereo);
  1453.  
  1454.         // alias reduction, IMDCT and polyphase
  1455.  
  1456.         float subbands[18][2][32];
  1457.  
  1458.         for(ch=0; ch<nch; ++ch) {
  1459.             const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][ch];
  1460.             unsigned wintype = 0;
  1461.             
  1462.             if (rsi.window_switching_flag)
  1463.                 wintype = rsi.switched.block_type;
  1464.  
  1465.             for(unsigned sb=0; sb<32; ++sb)    {
  1466.                 if (wintype == 2 && (!rsi.switched.mixed_block_flag || sb >= 2)) {
  1467.                     IMDCT_6_3(&recon[ch][sb*18], (float(*)[2][32])&subbands[0][ch][sb], mL3OverlapBuffer[ch][sb], mL3Windows[wintype]);
  1468.                 } else {
  1469.                     if (sb < 31) {
  1470.                         static const float cs_tab[8]={
  1471.                             0.85749292571254f,
  1472.                             0.88174199731771f,
  1473.                             0.94962864910273f,
  1474.                             0.98331459249179f,
  1475.                             0.99551781606759f,
  1476.                             0.99916055817815f,
  1477.                             0.99989919524445f,
  1478.                             0.99999315507028f
  1479.                         };
  1480.  
  1481.                         static const float ca_tab[8]={
  1482.                             -0.51449575542753f,
  1483.                             -0.47173196856497f,
  1484.                             -0.31337745420390f,
  1485.                             -0.18191319961098f,
  1486.                             -0.09457419252642f,
  1487.                             -0.04096558288530f,
  1488.                             -0.01419856857247f,
  1489.                             -0.00369997467376f
  1490.                         };
  1491.  
  1492.                         float *p = &recon[ch][sb*18+18];
  1493.  
  1494.                         struct local {
  1495.                             static inline void antialias(float& x, float& y, float cs, float ca) {
  1496.                                 const float xt = x, yt = y;
  1497.  
  1498.                                 x = xt*cs - yt*ca;
  1499.                                 y = xt*ca + yt*cs;
  1500.                             }
  1501.                         };
  1502.  
  1503.                         local::antialias(p[-1], p[0], cs_tab[0], ca_tab[0]);
  1504.                         local::antialias(p[-2], p[1], cs_tab[1], ca_tab[1]);
  1505.                         local::antialias(p[-3], p[2], cs_tab[2], ca_tab[2]);
  1506.                         local::antialias(p[-4], p[3], cs_tab[3], ca_tab[3]);
  1507.                         local::antialias(p[-5], p[4], cs_tab[4], ca_tab[4]);
  1508.                         local::antialias(p[-6], p[5], cs_tab[5], ca_tab[5]);
  1509.                         local::antialias(p[-7], p[6], cs_tab[6], ca_tab[6]);
  1510.                         local::antialias(p[-8], p[7], cs_tab[7], ca_tab[7]);
  1511.  
  1512. #if 0
  1513.                         for(unsigned i=0; i<8; ++i) {
  1514.                             const double x = recon[ch][sb*18+17-i];
  1515.                             const double y = recon[ch][sb*18+18+i];
  1516.                             const double cs = cs_tab[i];
  1517.                             const double ca = ca_tab[i];
  1518.  
  1519.                             recon[ch][sb*18+17-i] = x*cs - y*ca;
  1520.                             recon[ch][sb*18+18+i] = x*ca + y*cs;
  1521.                         }
  1522. #endif
  1523.                     }
  1524.  
  1525.                     if (sb*18 >= gr_zero_bound+35) {
  1526.                         IMDCT_18_Null((float(*)[2][32])&subbands[0][ch][sb], mL3OverlapBuffer[ch][sb]);
  1527.                     } else {
  1528.                         IMDCT_18(&recon[ch][sb*18], (float(*)[2][32])&subbands[0][ch][sb], mL3OverlapBuffer[ch][sb], mL3Windows[wintype]);
  1529.                     }
  1530.                 }
  1531.             }
  1532.         }
  1533.  
  1534.         profile_add(p_hybrid);
  1535.  
  1536.         for(unsigned s=0; s<18; ++s) {
  1537.             if (s & 1) {
  1538.                 for(unsigned sb=1; sb<32; sb+=2) {
  1539.                     subbands[s][0][sb] = -subbands[s][0][sb];
  1540.                     subbands[s][1][sb] = -subbands[s][1][sb];
  1541.                 }
  1542.             }
  1543.  
  1544.             if (nch>1) {
  1545.                 mpPolyphaseFilter->Generate(subbands[s][0], subbands[s][1], mpSampleDst);
  1546.                 mpSampleDst += 64;
  1547.             } else {
  1548.                 mpPolyphaseFilter->Generate(subbands[s][0], NULL, mpSampleDst);
  1549.                 mpSampleDst += 32;
  1550.             }
  1551.         }
  1552.  
  1553.         profile_add(p_polyphase);
  1554.     }
  1555.  
  1556. #ifdef RDTSC_PROFILE
  1557.  
  1558.     if (!(++p_frames & 127)) {
  1559.         static char buf[256];
  1560.  
  1561.         sprintf(buf, "%d frames: total %I64d, scalefac %d%%, huffdec %d%%/%d%%, dequan %d%%, stereo %d%%, hybrid %d%% (%lu), poly %d%%\n"
  1562.                 ,p_frames
  1563.                 ,p_total
  1564.                 ,(long)((p_scalefac*100)/p_total)
  1565.                 ,(long)((p_huffdec*100)/p_total)
  1566.                 ,(long)((p_huffdec1*100)/p_total)
  1567.                 ,(long)((p_dequan*100)/p_total)
  1568.                 ,(long)((p_stereo*100)/p_total)
  1569.                 ,(long)((p_hybrid*100)/p_total)
  1570.                 ,(long)p_hybrid
  1571.                 ,(long)((p_polyphase*100)/p_total)
  1572.                 );
  1573.         OutputDebugString(buf);
  1574.     }
  1575. #endif
  1576.  
  1577.     mSamplesDecoded += (is_mpeg2 ? 576 : 1152) * nch;
  1578.  
  1579.     return false;
  1580. }
  1581.